Download the images

The MS COCO 2014 dataset has 40,000 images for validation, and 80,000 for training. Text annotations are only available for the training set.

Note that the .zip file is relatively large (about 14 GB), so the following cell will take a long time to execute.

arXiv:1405.0312


In [ ]:
%%bash

# -p: no error if ../data already exists, so the cell is safe to re-run
mkdir -p ../data
cd ../data

# ~13 GB download.
# -f: fail on HTTP errors instead of saving an error page as the zip
# -L: follow redirects
# -O: keep the remote filename
curl -f -L -O http://msvocds.blob.core.windows.net/coco2014/train2014.zip
# -q: suppress the per-file listing (tens of thousands of lines of output)
unzip -q train2014.zip
rm train2014.zip

Get the annotations

The COCO-Text dataset has 170,000 text instances; about half of the MS COCO images contain text. Note that COCO-Text defines its own training and validation splits, both drawn from the MS COCO *training* images — which is why only train2014 needs to be downloaded above.

arXiv:1601.07140


In [ ]:
%%bash

# Create ../data if the download cell above has not been run yet,
# so this cell also works standalone.
mkdir -p ../data
cd ../data

# -f: fail on HTTP errors; -L: follow redirects; -O: keep remote filename
curl -f -L -O https://s3.amazonaws.com/cocotext/COCO_Text.zip
# -q: suppress the per-file listing
unzip -q COCO_Text.zip
rm COCO_Text.zip

Split the images into labeled segments


In [ ]:
from __future__ import absolute_import, division, print_function

# The maximum number of training images per class
N_TRAIN = 30000
# The maximum number of validation images per class
N_VALID = 15000
# The minimum overlap area, in pixels, between an annotation's bounding box
# and a segment for has_text(...) to report True
OVERLAP_THRESHOLD = 500
# The side length of the square segments, in pixels
SEGMENT_SIZE = 128

In [ ]:
def filename_to_id(filename):
    """Map a COCO image filename to its numeric image ID as a string.

    E.g. 'COCO_train2014_000000000581.jpg' -> '581'. The ID is the third
    underscore-separated field, with the extension and leading zeros removed.
    """
    zero_padded = filename.split('_')[2]   # third underscore-separated field
    raw_id = zero_padded.split('.')[0]     # drop the file extension
    return str(int(raw_id))                # int round-trip strips leading zeros

In [ ]:
def calculate_overlap(rect1, rect2):
    """Return the intersection area of two axis-aligned rectangles.

    Each rectangle is a 4-tuple (x0, y0, x1, y1) holding the top-left and
    bottom-right corner coordinates. Disjoint rectangles yield 0.
    """
    ax0, ay0, ax1, ay1 = rect1
    bx0, by0, bx1, by1 = rect2

    dx = min(ax1, bx1) - max(ax0, bx0)
    dy = min(ay1, by1) - max(ay0, by0)

    if dx <= 0 or dy <= 0:
        return 0
    return dx * dy

In [ ]:
def has_text(image_id, segment_rect, annotation_data):
    """Report whether any legible English annotation overlaps the segment
    by more than OVERLAP_THRESHOLD pixels of area.

    image_id        -- COCO image ID as a string with no leading zeros
    segment_rect    -- 4-tuple in the same axis layout as the rectangles
                       produced by get_segments
    annotation_data -- parsed COCO_Text JSON dict with 'imgToAnns' / 'anns'
    """
    try:
        raw_ids = annotation_data['imgToAnns'][image_id]
    except KeyError:
        raw_ids = []  # image has no annotations at all

    for raw_id in raw_ids:
        annotation = annotation_data['anns'][str(raw_id)]

        if annotation['legibility'] != 'legible':
            continue
        if annotation['language'] != 'english':
            continue

        # The bbox axes are swapped here so the resulting rectangle matches
        # the axis layout of segment_rect (presumably [x, y, w, h] in the
        # JSON mapped onto a (row, col) layout -- consistent with the
        # original implementation).
        bbox = annotation['bbox']
        annotation_rect = (bbox[1], bbox[0],
                           bbox[1] + bbox[3], bbox[0] + bbox[2])

        if calculate_overlap(annotation_rect, segment_rect) > OVERLAP_THRESHOLD:
            return True

    return False

In [ ]:
from PIL import Image
from math import ceil
import numpy as np

def get_segments(filename, image_dir='../data/train2014/'):
    """Load the specified image and cut it into square segments.

    filename  -- image filename inside image_dir
    image_dir -- directory the image is loaded from (default keeps the
                 original hard-coded location, so existing callers work)

    Returns (segments, segment_rects):
      segments      -- list of (SEGMENT_SIZE x SEGMENT_SIZE) single-channel
                       float arrays
      segment_rects -- list of matching (row0, col0, row1, col1) tuples, in
                       the same axis layout has_text expects

    The image is padded out to a multiple of SEGMENT_SIZE with uniformly
    distributed random values before slicing.
    """
    global SEGMENT_SIZE

    image = Image.open(image_dir + filename)

    segments = []
    segment_rects = []

    width, height = image.size

    padded_width = int(ceil(width / SEGMENT_SIZE) * SEGMENT_SIZE)
    padded_height = int(ceil(height / SEGMENT_SIZE) * SEGMENT_SIZE)

    # numpy arrays are indexed (row, col) == (y, x)
    padded_image = np.random.uniform(0, 256, (padded_height, padded_width))

    # Convert the loaded image to grayscale using Rec. 709 luma weights.
    # BUG FIX: PIL's Image.split() yields bands in (R, G, B) order for RGB
    # images; the original unpacked them as (b, g, r), which swapped the
    # red and blue weights.
    try:
        r, g, b = image.split()
        gray_image = np.multiply(0.21, r) + np.multiply(0.72, g) + np.multiply(0.07, b)
        padded_image[:height, :width] = gray_image
    except ValueError:
        # Single-band (grayscale) images cannot be unpacked into 3 channels
        padded_image[:height, :width] = image

    # BUG FIX: the original iterated its row variable over the *width* range
    # and its column variable over the *height* range; for non-square images
    # numpy silently clips the out-of-range slices, producing truncated or
    # empty segments.
    for row in range(0, padded_height, SEGMENT_SIZE):
        for col in range(0, padded_width, SEGMENT_SIZE):
            segments.append(padded_image[row:row + SEGMENT_SIZE,
                                         col:col + SEGMENT_SIZE])
            segment_rects.append((row, col,
                                  row + SEGMENT_SIZE, col + SEGMENT_SIZE))

    return segments, segment_rects

In [ ]:
# NOTE: scipy.misc.imsave was deprecated in SciPy 1.0 and removed in 1.2,
# so this cell now saves via PIL (Image and np are imported above).

def save_image(path, image):
    """Rescale a 2-D array to the 0-255 range and save it as an image file.

    The min/max rescaling reproduces the bytescale normalisation that the
    removed scipy.misc.imsave applied to float input.

    Returns: the number of images successfully saved. (Either 0 or 1.)
    """
    try:
        data = np.asarray(image, dtype=np.float64)
        # .min()/.max() raise ValueError on empty input, matching the
        # failure mode the original except clause was guarding against
        low, high = data.min(), data.max()
        if high > low:
            scaled = (data - low) * (255.0 / (high - low))
        else:
            scaled = np.zeros(data.shape)
        Image.fromarray(scaled.astype(np.uint8)).save(path)
    except ValueError:
        return 0

    return 1

In [ ]:
%%bash

cd ../data

# -p: create parent directories as needed and succeed if they already
# exist, so the cell is safe to re-run
mkdir -p train/text
mkdir -p train/no-text

mkdir -p valid/text
mkdir -p valid/no-text

In [ ]:
from os import walk
import json

images = next(walk('../data/train2014'))[2]
print('The dataset has ' + str(len(images)) + ' images.')

# Use a context manager so the file handle is closed promptly; the original
# json.load(open(...)) left it open until garbage collection.
with open('../data/COCO_Text.json') as annotation_file:
    annotation_data = json.load(annotation_file)
print('Text annotations loaded.')

In [ ]:
import random

# Shuffle once so the train/validation split is drawn randomly from the
# full image list.
# NOTE(review): no seed is set, so the split differs on every run; call
# random.seed(<constant>) first if a reproducible split is wanted.
random.shuffle(images)

In [ ]:
n_text = 0
n_no_text = 0

# Number of images consumed by this training pass; the validation pass
# below resumes the shuffled list from this index.
i_last_training_image = 0

for filename in images:
    if n_text == N_TRAIN and n_no_text == N_TRAIN:
        break

    i_last_training_image += 1
    image_id = filename_to_id(filename)

    segments, segment_rects = get_segments(filename)

    for i in range(len(segments)):
        # BUG FIX: the original used a single if/elif, so once the text
        # quota was full, text-bearing segments fell through into the
        # 'no-text' folder and contaminated the labels. Now surplus text
        # segments are simply dropped.
        if has_text(image_id, segment_rects[i], annotation_data):
            if n_text < N_TRAIN:
                n_text += save_image('../data/train/text/' + str(n_text) + '.jpg', segments[i])
        elif n_no_text < N_TRAIN:
            n_no_text += save_image('../data/train/no-text/' + str(n_no_text) + '.jpg', segments[i])

print('Successfully processed: ' + str(n_text + n_no_text) + ' / ' + str(N_TRAIN * 2))

In [ ]:
n_text = 0
n_no_text = 0

# Only images that were not consumed by the training pass are used here.
for filename in images[i_last_training_image:]:
    if n_text == N_VALID and n_no_text == N_VALID:
        break

    image_id = filename_to_id(filename)

    segments, segment_rects = get_segments(filename)

    for i in range(len(segments)):
        # BUG FIX: same labeling bug as the training pass -- once the text
        # quota was full, text-bearing segments fell through into the
        # 'no-text' folder. Surplus text segments are now dropped instead.
        if has_text(image_id, segment_rects[i], annotation_data):
            if n_text < N_VALID:
                n_text += save_image('../data/valid/text/' + str(n_text) + '.jpg', segments[i])
        elif n_no_text < N_VALID:
            n_no_text += save_image('../data/valid/no-text/' + str(n_no_text) + '.jpg', segments[i])

print('Successfully processed: ' + str(n_text + n_no_text) + ' / ' + str(N_VALID * 2))